Data provider country statistics

New OBIS datasets since 2021

Config

start_year <- 2021
end_year <- 2023

Dependencies

library(DBI)
library(dplyr)
library(glue)
library(countrycode)
library(ggplot2)

Load data

readRenviron("env.txt")

con <- dbConnect(RPostgres::Postgres(), dbname = Sys.getenv("OBIS_DB"), host = Sys.getenv("OBIS_HOST"), port = Sys.getenv("OBIS_PORT"), user = Sys.getenv("OBIS_USER"), password = Sys.getenv("OBIS_PASSWORD"), bigint = "integer")

res <- dbSendQuery(con, glue("
    select
        datasets.id,
        datasets.url,
        (datasets.statistics->>'Occurrence')::int as records,
        extract('year' from datasets.created) as year,
        persons.organization,
        persons.country as person_country,
        ih.country as inst_country,
        persons.oceanexpert_institution_id
    from datasets
    left join metadata.lnk_dataset_person ldp on ldp.dataset_id = datasets.id
    left join metadata.persons on ldp.person_id = persons.id
    left join metadata.institution_hierarchy ih on ih.id = persons.oceanexpert_institution_id
    order by (datasets.statistics->>'Occurrence')::int desc nulls last, datasets.id
"))
datasets <- dbFetch(res) %>%
  mutate(inst_country = stringr::str_replace(inst_country, "Réunion", "France")) %>%
  mutate(inst_country = stringr::str_replace(inst_country, "Wallis Futuna Islands", "Wallis and Futuna Islands")) %>%
  distinct()

Cleanup country names

country_codes <- countrycode::codelist %>% select(name = un.name.en, iso2c) %>%
  filter(!is.na(iso2c))

datasets <- datasets %>%
  left_join(country_codes, by = c("person_country" = "iso2c")) %>%
  rowwise() %>%
  mutate(country = ifelse(!is.na(inst_country), inst_country, name)) %>%
  mutate(across(where(is.character), ~na_if(., ""))) %>%
  filter(!is.na(country))

head(datasets)
## # A tibble: 6 × 10
## # Rowwise: 
##   id                url   records  year organization person_country inst_country
##   <chr>             <chr>   <int> <dbl> <chr>        <chr>          <chr>       
## 1 46389886-a09c-4c… http… 8331913  2023 Flanders Ma… BE             Belgium     
## 2 efa02fe9-6b5b-41… http… 4583637  2020 CSIRO Ocean… AU             Australia   
## 3 efa02fe9-6b5b-41… http… 4583637  2020 CSIRO Natio… AU             Australia   
## 4 80479e14-2730-43… http… 3134870  2020 U.S. Geolog… <NA>           United Stat…
## 5 80479e14-2730-43… http… 3134870  2020 National Oc… <NA>           United Stat…
## 6 8b0d5fdd-6a3f-48… http… 3071809  2020 School of E… AU             Australia   
## # ℹ 3 more variables: oceanexpert_institution_id <int>, name <chr>,
## #   country <chr>

Add region info

countries <- m49::m49_full %>% mutate(country_or_area = stringr::str_replace(country_or_area, "Turkey", "Türkiye")) %>% select(country_or_area, ldc, lldc, sids, developed, developing, region_name) %>%
  mutate(classification = factor(ifelse(sids, "SIDS", ifelse(region_name == "Africa", "Africa", "other")), levels = c("SIDS", "Africa", "other")))

datasets <- datasets %>%
  left_join(countries, by = c("country" = "country_or_area"))

Statistics

recent_datasets <- datasets %>%
  filter(year >= start_year & year <= end_year)

recent_stats <- recent_datasets %>%
  distinct(id, records, year, country, sids, region_name, classification) %>%
  filter(!is.na(country)) %>%
  group_by(country, year, sids, region_name, classification) %>%
  summarize(datasets = n(), records = sum(records)) %>%
  arrange(desc(records))

head(recent_stats)
## # A tibble: 6 × 7
## # Groups:   country, year, sids, region_name [6]
##   country                 year sids  region_name classification datasets records
##   <chr>                  <dbl> <lgl> <chr>       <fct>             <int>   <int>
## 1 Belgium                 2023 FALSE Europe      other                33  1.40e7
## 2 United States of Amer…  2021 FALSE Americas    other               162  6.78e6
## 3 Canada                  2021 FALSE Americas    other                45  6.57e6
## 4 Belgium                 2022 FALSE Europe      other                66  6.51e6
## 5 Netherlands             2022 FALSE Europe      other                14  3.94e6
## 6 Denmark                 2023 FALSE Europe      other                 4  3.15e6
write.csv(recent_stats, file = "output/stats.csv", row.names = FALSE)
recent_stats %>%
  relocate(country, year, records) %>% rmarkdown::paged_table(options = list(rows.print = 100))

Visualize

stats_all_years <- recent_stats %>%
  group_by(country) %>%
  summarize(records = sum(records))

recent_stats <- recent_stats %>%
  mutate(country = factor(country, levels = stats_all_years$country[order(stats_all_years$records)]))

ggplot(data = recent_stats) +
  geom_bar(aes(x = country, y = records, fill = classification), stat = "identity") +
  theme_minimal() +
  coord_flip() +
  scale_fill_manual(values = c("#e9933e", "#f3c654", "#cccccc")) +
  ggtitle("Biodiversity records contributed to OBIS by country") +
  scale_y_continuous(labels = scales::label_comma()) +
  facet_wrap(~year)

ggsave("output/graph_untransformed.png", width = 12, height = 7, dpi = 300, scale = 1.2, bg = "white")

ggplot(data = recent_stats) +
  geom_bar(aes(x = country, y = records, fill = classification), stat = "identity") +
  theme_minimal() +
  coord_flip() +
  scale_fill_manual(values = c("#e9933e", "#f3c654", "#cccccc")) +
  ggtitle("Biodiversity records contributed to OBIS by country") +
  scale_y_continuous(labels = scales::label_comma(), trans = "log10") +
  facet_wrap(~year)

ggsave("output/graph.png", width = 12, height = 7, dpi = 300, scale = 1.2, bg = "white")

Statistics

Records and datasets per year:

recent_datasets %>%
  distinct(id, year, records) %>%
  group_by(year) %>%
  summarize(records = sum(records, na.rm = TRUE), datasets = n())
## # A tibble: 3 × 3
##    year  records datasets
##   <dbl>    <int>    <int>
## 1  2021 17257846      651
## 2  2022 12984433      399
## 3  2023 26675354      442

Countries per year and classification:

recent_stats %>%
  group_by(year, classification) %>%
  summarize(records = sum(records, na.rm = TRUE), datasets = n(), countries = length(unique(country)))
## # A tibble: 9 × 5
## # Groups:   year [3]
##    year classification  records datasets countries
##   <dbl> <fct>             <int>    <int>     <int>
## 1  2021 SIDS              68960        4         4
## 2  2021 Africa             1982        3         3
## 3  2021 other          21806448       38        38
## 4  2022 SIDS            1054697       10        10
## 5  2022 Africa           252115        5         5
## 6  2022 other          17910142       43        43
## 7  2023 SIDS              87540        5         5
## 8  2023 Africa            23914        5         5
## 9  2023 other          33892262       33        33

All OBIS datasets so far

stats <- datasets %>%
  distinct(id, records, year, country, sids, region_name, classification) %>%
  filter(!is.na(country)) %>%
  group_by(country, year, sids, region_name, classification) %>%
  summarize(datasets = n(), records = sum(records)) %>%
  arrange(desc(records))

stats %>%
  group_by(country, classification) %>%
  summarize(min_year = min(year)) %>%
  group_by(classification, min_year) %>%
  summarize(countries = n()) %>%
  arrange(classification, !is.na(min_year), min_year) %>%
  group_by(classification) %>%
  mutate(cumulative_countries = cumsum(countries))
## # A tibble: 13 × 4
## # Groups:   classification [3]
##    classification min_year countries cumulative_countries
##    <fct>             <dbl>     <int>                <int>
##  1 SIDS                 NA        13                   13
##  2 SIDS               2019         1                   14
##  3 SIDS               2020         3                   17
##  4 SIDS               2021         1                   18
##  5 SIDS               2022         3                   21
##  6 Africa               NA        14                   14
##  7 Africa             2021         1                   15
##  8 Africa             2022         1                   16
##  9 other                NA        55                   55
## 10 other              2019         1                   56
## 11 other              2020         2                   58
## 12 other              2022         3                   61
## 13 other              2023         1                   62

BioEco GeoNode monitoring programmes

layers <- jsonlite::fromJSON("https://geonode.goosocean.org/api/layers/")$objects
stopifnot(nrow(layers) < 1000)
regions <- unlist(layers$regions)

Fix region names:

unique(regions[which(!regions %in% countries$country_or_area)])
##  [1] "Mediterranean Sea"               "Alaska"                         
##  [3] "California"                      "Baltic Sea"                     
##  [5] "Global"                          "Queensland"                     
##  [7] "Southern Ocean"                  "Turkey"                         
##  [9] "Newfoundland and Labrador"       "Faeroe Islands"                 
## [11] "United Kingdom"                  "Americas"                       
## [13] "Scotland"                        "Europe"                         
## [15] "North America"                   "Pacific"                        
## [17] "Baker Island"                    "Hawaii"                         
## [19] "Northwestern Hawaiian Islands"   "Howland Island"                 
## [21] "Jarvis Islands"                  "Johnston Atoll"                 
## [23] "Kingman Reef"                    "Micronesia, Federated States of"
## [25] "Midway Islands"                  "Palmyra Atoll"                  
## [27] "Wake Island"                     "South America"                  
## [29] "Caribbean"                       "Asia"                           
## [31] "Middle East"                     "Cape Verde"
regions <- plyr::revalue(regions, c(
  "United Kingdom" = "United Kingdom of Great Britain and Northern Ireland",
  "Scotland" = "United Kingdom of Great Britain and Northern Ireland",
  "Hawaii" = "United States of America",
  "Alaska" = "United States of America",
  "Queensland" = "Australia",
  "Queensland" = "Australia",
  "Newfoundland and Labrador" = "Canada",
  "California" = "United States of America",
  "Turkey" = "Türkiye",
  "Micronesia, Federated States of" = "Micronesia (Federated States of)",
  "Cape Verde" = "Cabo Verde",
  "Faeroe Islands" = "Faroe Islands",
  "Baker Island" = "United States of America",
  "Northwestern Hawaiian Islands" = "United States of America",
  "Howland Island" = "United States of America",
  "Jarvis Islands" = "United States of America",
  "Johnston Atoll" = "United States of America",
  "Kingman Reef" = "United States of America",
  "Micronesia, Federated States of" = "Micronesia (Federated States of)",
  "Midway Islands" = "United States of America",
  "Palmyra Atoll" = "United States of America",
  "Wake Island" = "United States of America",
  "Cape Verde" = "Cabo Verde"
))
bioeco_countries <- countries %>%
  filter(country_or_area %in% regions)

bioeco_countries %>%
  group_by(classification) %>%
  summarize(n())
## # A tibble: 4 × 2
##   classification `n()`
##   <fct>          <int>
## 1 SIDS              14
## 2 Africa             7
## 3 other             50
## 4 <NA>               1